Experimental design:
The experiment consisted of the following conditions: 1 wheat genotypes x 2 different soil types x 2 levels of SWC: high SWC (50% of soil water holding capacity (SWHC)); and low SWC (5–8% SWHC).
Experimental set-up:
Five replicate pots were filled with 1 kg of soil (see soil history), and eight seeds of wheat were sown. Pots were placed in a plant growth room based on a randomized complete block design. After one month, soil moisture treatments were applied by adjusting SWC to 5–8% SWHC, while controls were kept at 50% SWHC. After four weeks of growth under different moisture regimes rhizosphere samples were collected. Soil attached to the roots was considered as rhizosphere soil.
Soil history:
Soil managed under two different irrigation regimes were used: one from an irrigated and another from a directly adjacent non-irrigated experimental wheat field.
Research questions:
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_4.0.3 magrittr_1.5 tools_4.0.3 htmltools_0.5.0
## [5] yaml_2.2.1 stringi_1.5.3 rmarkdown_2.5 knitr_1.30
## [9] stringr_1.4.0 xfun_0.19 digest_0.6.27 rlang_0.4.8
## [13] evaluate_0.14
read_annot <- function(file) {
# this function takes an annotation.tsv file as input, selects defined columns to import,
# renames columns and converts it into a matrix, orders the matrix and subsets the dataset to 200 rows
df <- read.csv(file, sep = "\t", header = T, row.names = 2) [, c(2, 17, 19, 27, 31)]
df <- df[order(rownames(df)), ]
df <- df[1:200, ]
df <- df %>%
dplyr::rename(Phylum = tax_phylum, Genus = tax_genus) %>%
as.matrix()
}
annotations <- read_annot("annotations.tsv")
head(annotations, 5)
## product_name cog_name cog_category
## gene_id_1 "hypothetical" "NULL" "NULL"
## gene_id_10 "COG4221" "COG4221" "General function prediction only"
## gene_id_100 "Usp" "NULL" "NULL"
## gene_id_1000 "Sbm" "Sbm" "Lipid metabolism"
## gene_id_10000 "ABCB-BAC" "MdlB" "Defense mechanisms"
## Phylum Genus
## gene_id_1 "Actinobacteria" "Geodermatophilus"
## gene_id_10 "NULL" "NULL"
## gene_id_100 "NULL" "NULL"
## gene_id_1000 "NULL" "NULL"
## gene_id_10000 "Firmicutes" "Symbiobacterium"
read_abund <- function(file) {
# this function takes an abundance.tsv file as input, reorders columns and converts, it into a matrix,
# orders the matrix and subsets the dataset to 200 rows
df <- read.csv(file, sep = "\t", header = T, row.names = 1)
df <- df[order(rownames(df)), ]
df <- df[1:200, ]
df <- df %>%
relocate("NI_5_1", "NI_5_2", "NI_5_3", "NI_5_4", "NI_5_5", "NI_50_1", "NI_50_2", "NI_50_3",
"NI_50_4", "NI_50_5", "IR_5_1", "IR_5_2", "IR_5_3","IR_5_4", "IR_5_5", "IR_50_1",
"IR_50_2", "IR_50_3", "IR_50_4", "IR_50_5") %>%
as.matrix()
}
abundance_rel <- read_abund("merged_gene_abundance_renamed.tsv")
head(abundance_rel, 5)
## NI_5_1 NI_5_2 NI_5_3 NI_5_4 NI_5_5 NI_50_1 NI_50_2 NI_50_3
## gene_id_1 10 0 6 4 0 0 2 0
## gene_id_10 6 0 4 2 0 2 0 0
## gene_id_100 2 0 0 2 1 0 0 2
## gene_id_1000 4 0 8 4 2 2 2 4
## gene_id_10000 24 30 30 24 25 31 28 19
## NI_50_4 NI_50_5 IR_5_1 IR_5_2 IR_5_3 IR_5_4 IR_5_5 IR_50_1
## gene_id_1 2 2 0 0 4 0 0 0
## gene_id_10 8 2 8 2 0 9 2 4
## gene_id_100 1 7 0 2 5 2 6 3
## gene_id_1000 2 0 10 8 6 8 6 6
## gene_id_10000 36 17 20 18 24 32 40 20
## IR_50_2 IR_50_3 IR_50_4 IR_50_5
## gene_id_1 8 0 0 4
## gene_id_10 4 2 4 4
## gene_id_100 0 2 2 0
## gene_id_1000 8 18 10 6
## gene_id_10000 29 32 32 30
create_samdata <- function(x) {
# this function creates sample metadata as defined below
sample_data(data.frame(
perc_SWHC = rep(c("5", "50"), each = 5),
soil_type = rep(c("NI", "IR"), each = 10),
block = rep(c("1", "2", "3", "4", "5"), each = 1),
row.names = sample_names(x),
stringsAsFactors = FALSE
))
}
to_phyloseq <- function(otu, tax) {
# this function converts OTU, TAX, sample data into one global phyloseq object
OTU <- otu_table(otu, taxa_are_rows = TRUE)
TAX <- tax_table(tax)
physeq <- phyloseq(OTU, TAX)
sam_data <- create_samdata(physeq)
phyloseq(OTU, TAX, sam_data)
}
# create phyloseq object for relative abundance
ps_rel <- to_phyloseq(abundance_rel, annotations)
knitr::opts_chunk$set(fig.path = "/Users/ruthschmidt/Dropbox/Work/INRS/Data/metagenomics_wheat_soil/Output/alphadiv")
# calculate alpha diversity of relative data
alphadiv_rel <- estimate_richness(ps_rel, measures = c("Shannon", "Simpson"))
# get the metadata out as separate object
alphadiv_rel.meta <- meta(ps_rel)
# add rownames as a new colum for integration later
alphadiv_rel.meta$sam_name <- rownames(alphadiv_rel.meta)
# add the rownames to diversity table
alphadiv_rel$sam_name <- rownames(alphadiv_rel)
# merge these two data frames
alphadiv_rel.df <- merge(alphadiv_rel.meta, alphadiv_rel, by = "sam_name")
# convert to data frame
alphadiv_rel.melt <- reshape2::melt(alphadiv_rel.df)
## Using sam_name, perc_SWHC, soil_type, block as id variables
# perform three-way ANOVA on relative data
run_anova <- function(data, formula) {
aov(formula, data)
}
aov_rel.shannon <- alphadiv_rel.melt %>%
filter(variable == "Shannon") %>%
run_anova(value ~ perc_SWHC * soil_type + Error(block))
aov_rel.simpson <- alphadiv_rel.melt %>%
filter(variable == "Simpson") %>%
run_anova(value ~ perc_SWHC * soil_type + Error(block))
# write list of results and safe file
aov_all <- list(
shannon_rel = summary(aov_rel.shannon),
simpson_rel = summary(aov_rel.simpson)
)
alphadiv <- function(index) {
# this function takes alpha div file and filters based on input variable for plotting
alphadiv_rel.melt %>%
filter(variable == index) %>%
ggboxplot(
alpha = 1,
x = "soil_type",
y = "value",
fill = "perc_SWHC",
palette = c("#9ECAE1", "#2171B5"),
ylab = index,
xlab = "Soil type"
) +
# scale_y_log10() +
theme_pubclean(base_size = 14) +
labs(fill = "% SWHC")
}
p1 <- alphadiv("Shannon")
p2 <- alphadiv("Simpson")
# arrange plots into one
alphadiv_rel.p <- ggarrange(p1, p2,
ncol = 2, nrow = 1,
common.legend = TRUE, legend = "top"
) %>%
annotate_figure(
left = text_grob("Alpha diversity measure", rot = 90, size = 14)
)
alphadiv_rel.p
Filtering: Before doing any statistical analysis (except for alpha diversity), the data HAVE to be filtered and normalized, otherwise the dataset is hugely impacted by small count values, is zero inflated and overdispersed/ (latter can be assessed by Coefficient of Variation (CV), relative standard deviation or IQR). Here, I first filtered the relative abundance dataset using a low count filter of a minimum count of 4 and prevalence of 20% (meaning for any feature to be retained at least 20% of its values should contain at least 4 counts - this is generally recommended), and a filter of 2.0 for low variance (usually 3.0 is recommended but it removes too many counts, so I adjusted it). More on statistical analysis of metagenomics data here.
Normalization: After filtering, I transformed the dataset to absolute counts using the internal standard method described below in the code. After this, I end up with an absolute dataset that is being used for all consecutive statistical analysis.
knitr::opts_chunk$set(fig.path = "/Users/ruthschmidt/Dropbox/Work/INRS/Data/metagenomics_wheat_soil/Output/alphadiv")
# Low-count filter: removes features with small counts that are usually caused by sequencing errors. More explained here: https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4637-6.
# Low count filter: prevalence in samples (%), min 4 counts in 20% of samples (generally, these are recommended standard settings)
ps_rel_filter <- filter_taxa(ps_rel, function(x) sum(x > 4) > (0.2 * length(x)), TRUE)
# rename for later
ps_rel <- ps_rel_filter
# normalization using internal standard: normalize counts to the number of reads per sample that mapped against the thermophilus ref genome
# read internal standard file, remove commas and convert to numeric data type
thermus <- data.frame(fread("qc_mapping_stats_renamed.tsv", sep = "\t", header = TRUE), check.names = FALSE)
thermus <- thermus[, c("sampleName", "totalReadsQCed", "properlyPaired")]
thermus$totalReadsQCed <- gsub(",", "", thermus$totalReadsQCed)
thermus$properlyPaired <- gsub(",", "", thermus$properlyPaired)
thermus$totalReadsQCed <- as.numeric(thermus$totalReadsQCed)
thermus$properlyPaired <- as.numeric(thermus$properlyPaired)
# get extraction masses
gram <- data.frame(fread("gram_renamed.txt", sep = "\t", header = T), check.names = FALSE)
colnames(gram) <- c("sampleName", "number", "weight") # ,"noThermophilusMolecules")
gram$number <- NULL
# merge the 2 tables
df <- merge(thermus, gram, by = "sampleName")
# compute conversion factor
df$properlyPairedByWeight <- df$properlyPaired * df$weight
# For Sa; 1 ng (amount of ng added per sample)*6.022×10^23 (avogadro nr) / 2127482 (length in bp of genome)*10^9 (to get to g)*650 (weight of 1 DNA bp) which results in 4.35E+06. This is per g of soil
df$noThermophilusMolecules <- (1 * 6.022E+23) / (2127482 * 1E+09 * 650)
# Here Sp = properlyPaired
# Sp = number of thermophilus genes
Sp <- 2218
# Sr = Ss /Sp
df$Sr <- df$properlyPaired / Sp
# Total number of genes (protein coding genes ; would have to make sure that all genes predicted by Prodigal are protein coding)
# Pg = (Ps x Sa) / Sr
Ps <- 2867789
df$Pg <- (df$noThermophilusMolecules * Ps) / df$Sr
# Loop all columns and perform normalization and create absolute abundance phyloseq obj
for (i in 1:ncol(ps_rel_filter@otu_table)) {
currCol <- ps_rel_filter@otu_table[, i, drop = FALSE]
currSampleID <- colnames(currCol)
currPg <- df[df$sampleName == currSampleID, ]$Pg
currWeight <- df[df$sampleName == currSampleID, ]$weight
currCol <- (currCol * currPg) / Ps # apply the formula
# Ga2 = Ga / Weight of sample in g to obtain molecules DNA per g.
currCol <- currCol / currWeight # divide by weight in g to get molecules DNA / g.
# Then replace original reads abundance with new normalized values.
ps_rel_filter@otu_table[, i] <- currCol
ps_abs <- ps_rel_filter
}
# remove objects
rm(df, gram, thermus, currCol, currPg, currSampleID, currWeight, Sp, Ps)
# remove NULL annotations
ps_rel <- subset_taxa(ps_rel, Phylum != "NULL") %>%
subset_taxa(Genus != "NULL") %>%
subset_taxa(cog_category != "NULL")
ps_abs <- subset_taxa(ps_abs, Phylum != "NULL") %>%
subset_taxa(Genus != "NULL") %>%
subset_taxa(cog_category != "NULL")
# check abs abundances
head(ps_abs@otu_table, 5)
## OTU Table: [5 taxa and 20 samples]
## taxa are rows
## NI_5_1 NI_5_2 NI_5_3 NI_5_4 NI_5_5 NI_50_1
## gene_id_10000 1630.584 1916.0986 2087.8273 1458.6502 1806.1358 2153.4033
## gene_id_100000 2174.112 894.1793 1391.8849 1458.6502 1661.6450 1041.9693
## gene_id_10000000 679.410 1277.3991 1252.6964 1154.7647 1083.6815 694.6462
## gene_id_10000004 407.646 894.1793 417.5655 425.4396 1011.4361 1250.3632
## gene_id_10000006 203.823 319.3498 695.9424 972.4335 866.9452 1111.4340
## NI_50_2 NI_50_3 NI_50_4 NI_50_5 IR_5_1 IR_5_2
## gene_id_10000 1928.2625 1149.0995 2462.4004 1333.6511 1317.7968 1620.8346
## gene_id_100000 1446.1969 1572.4519 1915.2003 1255.2010 1054.2375 810.4173
## gene_id_10000000 1239.5973 1814.3676 1436.4002 2039.7017 1713.1359 2251.1592
## gene_id_10000004 482.0656 725.7470 615.6001 1019.8508 790.6781 1080.5564
## gene_id_10000006 344.3326 241.9157 1162.8002 941.4008 790.6781 540.2782
## IR_5_3 IR_5_4 IR_5_5 IR_50_1 IR_50_2 IR_50_3
## gene_id_10000 2002.7992 2079.9945 2419.0273 1565.4898 2273.1131 2393.0786
## gene_id_100000 250.3499 974.9974 665.2325 939.2939 313.5328 747.8370
## gene_id_10000000 2002.7992 1819.9952 1511.8921 1487.2153 2194.7299 1495.6741
## gene_id_10000004 751.0497 714.9981 423.3298 939.2939 313.5328 523.4859
## gene_id_10000006 500.6998 1039.9973 725.7082 1017.5684 548.6825 448.7022
## IR_50_4 IR_50_5
## gene_id_10000 2256.2552 1672.4041
## gene_id_100000 1128.1276 1003.4425
## gene_id_10000000 1974.2233 1114.9361
## gene_id_10000004 352.5399 445.9744
## gene_id_10000006 1339.6515 390.2276
Taxonomy: To get an overview of top phyla and genera, I plotted bubble charts sorted by the mean abundance, and then subsetted the top 10 phyala and genera for bar charts. Here, Proteobacteria (mainly Burkholderia) are more abundant in the 50% SWHC in soil with drought history and Actinobacteria (Streptomyces) in 5% SWHC in soil with drought history.
# set colors for plotting
nb.cols <- 7
mycolors <- colorRampPalette(brewer.pal(9, "Blues"))(nb.cols)[c(3, 6)]
# plot absolute abundance bubble chart phylum
abs_bubble.p <- psmelt(ps_abs) %>%
mutate(Phylum = fct_reorder(Phylum, Abundance, .fun = "mean", .desc = F)) %>%
ggplot(aes(x = perc_SWHC, y = Phylum, size = Abundance, color = perc_SWHC)) +
geom_point(alpha = 1) +
scale_size(range = c(2, 10), name = "Abundance", labels = scientific) +
labs(x = "% SWHC", y = "Phylum") +
facet_grid(~soil_type) +
ggtitle("Absolute abundance Phyla") +
scale_color_manual(values = mycolors) +
theme_pubclean(base_size = 14) +
theme(legend.position = "right") +
guides(color = FALSE) # show color annotations
abs_bubble.p
# plot absolute abundance genus bubble chart
abs_bubble.g <- psmelt(ps_abs) %>%
mutate(Genus = fct_reorder(Genus, Abundance, .fun = "mean", .desc = F)) %>%
ggplot(aes(x = perc_SWHC, y = Genus, size = Abundance, color = perc_SWHC)) +
geom_point(alpha = 1) + # color = "#2171B5#
scale_size(range = c(2, 7), name = "Abundance", labels = scientific) +
labs(x = "% SWHC", y = "Genus") +
facet_grid(~soil_type) +
ggtitle("Absolute abundance Genera") +
scale_color_manual(values = mycolors) +
theme_pubclean(base_size = 14) +
theme(legend.position = "right") +
guides(color = FALSE) # show color annotations
abs_bubble.g
# subset top 10 phyla absolute
abs_glomp <- tax_glom(ps_abs, taxrank = "Phylum")
abs_top10p <- sort(tapply(taxa_sums(abs_glomp), tax_table(abs_glomp)[, "Phylum"], sum), decreasing = TRUE)[1:10]
abs_top10p <- subset_taxa(abs_glomp, Phylum %in% names(abs_top10p))
# subset top 10 genera absolute
abs_glomg <- tax_glom(ps_abs, taxrank = "Genus")
abs_top10g <- sort(tapply(taxa_sums(abs_glomg), tax_table(abs_glomg)[, "Genus"], sum), decreasing = TRUE)[1:10]
abs_top10g <- subset_taxa(abs_glomg, Genus %in% names(abs_top10g))
# create dataframe from phyloseq object
create_dataframe <- function(dataframe) {
data.table(psmelt(dataframe))
}
abs_top10p.df <- create_dataframe(abs_top10p)
abs_top10g.df <- create_dataframe(abs_top10g)
mycolors <- c("#9ECAE1", "#2171B5")
## plot top phyla absolute abundance
abs_box.top10.p <- psmelt(abs_top10p) %>%
mutate(Phylum = fct_reorder(Phylum, Abundance, .fun = "mean", .desc = T)) %>%
ggboxplot(
x = "soil_type",
y = "Abundance",
fill = "perc_SWHC",
palette = c("#9ECAE1", "#2171B5"),
ylab = "Absolute abundance (log)",
xlab = "Soil type",
title = "Absolute abundance top phyla",
legend = "right",
scales = "free"
) +
facet_grid(~Phylum) +
scale_y_log10() +
theme_pubclean(base_size = 14) +
labs(fill = "% SWHC")
abs_box.top10.p
## plot top genera absolute abundance
abs_box.top10.g <- psmelt(abs_top10g) %>%
mutate(Genus = fct_reorder(Genus, Abundance, .fun = "mean", .desc = TRUE)) %>%
ggboxplot(
x = "soil_type",
y = "Abundance",
fill = "perc_SWHC",
palette = c("#9ECAE1", "#2171B5"),
ylab = "Absolute abundance (log)",
xlab = "Soil type",
title = "Absolute abundance top genera",
legend = "right",
scales = "free"
) +
facet_grid(~Genus) +
scale_y_log10() +
theme_pubclean(base_size = 14) +
labs(fill = "% SWHC")
abs_box.top10.g
Beta diversity: For Beta diversity (similarity between samples), NMDS with Bray Curtis dissimilarity was chosen.
# calculate Bray-Curtis distance of abs and rel and plot NMDS
# set color range
my_blue <- c("#9ECAE1", "#2171B5")
ordu_abs <- ordinate(ps_abs, "NMDS", "bray")
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.144231
## Run 1 stress 0.1463678
## Run 2 stress 0.1625719
## Run 3 stress 0.1508395
## Run 4 stress 0.1821288
## Run 5 stress 0.1442322
## ... Procrustes: rmse 0.0004433232 max resid 0.001380195
## ... Similar to previous best
## Run 6 stress 0.1442313
## ... Procrustes: rmse 0.0002326814 max resid 0.0007363951
## ... Similar to previous best
## Run 7 stress 0.1463682
## Run 8 stress 0.1797306
## Run 9 stress 0.1628827
## Run 10 stress 0.1622457
## Run 11 stress 0.149901
## Run 12 stress 0.1508399
## Run 13 stress 0.1532714
## Run 14 stress 0.1748024
## Run 15 stress 0.1463678
## Run 16 stress 0.1583612
## Run 17 stress 0.1688748
## Run 18 stress 0.1611395
## Run 19 stress 0.1463678
## Run 20 stress 0.1442313
## ... Procrustes: rmse 0.0002517593 max resid 0.0007906352
## ... Similar to previous best
## *** Solution reached
# plot PcoA
pcoa_abs <- plot_ordination(ps_abs, ordu_abs, color = "perc_SWHC", shape = "soil_type") +
geom_point(size = 5, alpha = 0.75) +
theme_pubclean(base_size = 14) +
scale_colour_manual(values = my_blue, name = "% SWHC") +
scale_shape_discrete(name = "Soil type")
pcoa_abs
Differential abundance analysis: I used DESeq for DE analysis using a multi-factorial design (soil_type + perc_SWHC). See the static heatmap for the top 50 genes, and the table following after to look at their annotation. The DL_5 samples cluster together and you can see there are a few highly abundant in that cluster in red (all belonging to phages). I also created an interactive heatmap that includes the annotation directly. This is more to explore for now and then focus later on on gene id’s or cog categories. Most genes relate to phages and transposons, but there also some other interesting ones, like the COG2227 which encodes for a methyltransferase enzyme, that is part of the terpene biosynthesis pathway and is responsible for creating different terpene structures. Not surprisingly, almost all DE gens belong to Actinobacteria.
# Differential Abundance for Microbiome Data (McMurdie and Holmes (2014)) using DESeq2: http://joey711.github.io/phyloseq-extensions/DESeq2.html
# set conditions and (multi)-factorial design: As perc_SWHC is the variable of interest, put it at the end of the formula. Thus the results function will by default pull the perc_SWHC results unless other arguments are specified.
ps_abs_ds <- phyloseq_to_deseq2(ps_abs, ~ soil_type + perc_SWHC)
## converting counts to integer mode
# run DESeq with formula defined above
ps_abs_ds <- DESeq(ps_abs_ds) # fitType='local'
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# extract significant gene_ids and join results with annotation from phyloseq object
# log2(n + 1) transform and subset to join with taxonomy table
ntd <- normTransform(ps_abs_ds)
subset_heatmap <- assay(ntd)
subset_heatmap <- as.data.frame(subset_heatmap)
tax.df <- as.data.frame(ps_abs@tax_table)
subset_tax <- tax.df %>%
rownames_to_column("row_names") %>%
semi_join(rownames_to_column(subset_heatmap, "row_names"), by = "row_names")
rownames(subset_tax) <- subset_tax$row_names
subset_tax <- subset_tax %>%
select(-row_names)
datatable(subset_tax)
# left join sub setted datasets for visualization using heatmaply
merged_heatmap <- merge(subset_heatmap, subset_tax, by = "row.names", all.x = F, all.y = F)
rownames(merged_heatmap) <- merged_heatmap$Row.names
merged_heatmap <- merged_heatmap %>% select(-Row.names) %>%
select(NI_5_1:IR_50_5, cog_category, Phylum, Genus)
# plot heatmap using heatmaply
heatmap.clust <- heatmaply::heatmaply(
heatmaply::percentize(merged_heatmap), # dend = "none", # activate if no clustering method
xlab = "Sample ID",
ylab = "Gene ID",
plot_method = "ggplot"
)
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust vegan
heatmap.clust